In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
In [2]:
%load ../ud120-projects/final_project/poi_id.py
In [4]:
#%%writefile ../ud120-projects/final_project/poi_id.py
#!/usr/bin/python
### Selects the features, loads the project dataset, converts it to
### labels/features, and pickles the classifier, dataset and feature list.
import matplotlib.pyplot as plt
import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat
from feature_format import targetFeatureSplit

### features_list is a list of strings, each of which is a feature name
### first feature must be "poi", as this will be singled out as the label
### "poi" must appear exactly once: a second occurrence would be extracted as
### an ordinary feature column and leak the label into the features.
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances',
                 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
                 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
                 'restricted_stock', 'director_fees', 'to_messages',
                 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
                 'shared_receipt_with_poi']

### load the dictionary containing the dataset
### (pickle.load can execute arbitrary code; only load files you trust)
with open("../ud120-projects/final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)

### we suggest removing any outliers before proceeding further

### if you are creating any new features, you might want to do that here

### store to my_dataset for easy export below
my_dataset = data_dict

### these two lines extract the features specified in features_list
### and extract them from data_dict, returning a numpy array
data = featureFormat(my_dataset, features_list)

### if you are creating new features, could also do that here

### split into labels and features (this line assumes that the first
### feature in the array is the label, which is why "poi" must always
### be first in features_list
labels, features = targetFeatureSplit(data)

### machine learning goes here!
### please name your classifier clf for easy export below
clf = None ### get rid of this line! just here to keep code from crashing out-of-box

### dump your classifier, dataset and features_list so
### anyone can run/check your results
### (the with-blocks flush and close every output file; my_dataset is the
### export object, so it is what gets written)
with open("../ud120-projects/final_project/my_classifier.pkl", "w") as clf_file:
    pickle.dump(clf, clf_file)
with open("../ud120-projects/final_project/my_dataset.pkl", "w") as dataset_out:
    pickle.dump(my_dataset, dataset_out)
with open("../ud120-projects/final_project/my_feature_list.pkl", "w") as features_out:
    pickle.dump(features_list, features_out)
In [55]:
# Reload the exported dataset; the with-block closes the file handle once the
# pickle has been read instead of leaving it open for the kernel's lifetime.
with open("../ud120-projects/final_project/my_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)
In [56]:
#[v for k,v in data_dict.items()][0]
In [57]:
# Show one (key, value) pair from the dataset dictionary.
list(data_dict.items())[0]
Out[57]:
In [58]:
# One row per dictionary key, one column per field of its record.
df = pd.DataFrame.from_dict(data=my_dataset, orient='index')
In [59]:
#%load ../ud120-projects/tools/feature_format.py
In [60]:
# Preview the first rows of the frame.
df.head()
Out[60]:
In [61]:
# List the distinct values found in the salary column.
df['salary'].unique()
Out[61]:
'NaN' was imported as a string instead of a missing value. We will convert these to the NaN type and look at how many missing values our data has.
In [62]:
# Turn the literal string 'NaN' into a real missing-value marker.
df = df.replace(to_replace='NaN', value=np.nan)
In [63]:
# Summarise the columns now that 'NaN' strings have been replaced.
df.info()
There is a lot of missing data!
In [13]:
print "NaN - Missing values:"
len(df.index)-df.count()
Out[13]:
First, check for potential invalid people in the dataset by looking at names without a " ".
In [64]:
# Index labels that contain no space character.
[name for name in df.index if name.find(" ") == -1]
Out[64]:
TOTAL is an aggregate category, and not a person's name. This should be removed.
In [65]:
# Remove the 'TOTAL' row from the frame.
df = df.drop('TOTAL', axis=0)
Next, we'll look at the names of people who have 3 or fewer entries (one of which is simply the True/False poi flag rather than a feature) out of 21 features. One happens to be a travel agency, and the others are missing nearly all entries as well.
These are good candidates for potential removal.
In [66]:
print [ind for ind in enumerate(df.T.count()) if ind[1] <= 3]
df.irow([56, 84, 127, 137, 142])
Out[66]:
In [67]:
#df.columns
#df = df.drop(['Name'], axis=1)
# Index labels of the rows to remove from the frame.
rows_to_drop = [
    'GRAMM WENDY L',
    'THE TRAVEL AGENCY IN THE PARK',
    'LOCKHART EUGENE E',
    'WHALEY DAVID A',
    'WROBEL BRUCE',
]
df = df.drop(rows_to_drop, axis=0)
Email address is also not needed for this model as it is a unique string for each person.
In [68]:
# Remove the email_address column.
df = df.drop('email_address', axis=1)
First, we must deal with the NaNs, since many models don't accept missing values. For a quick and dirty solution, we will just fill in 0s for the missing values.
This is just to get a model up and running, and will be handled differently later.
In [19]:
# Quick baseline imputation: replace every missing value with 0.
df = df.fillna(0)
In [20]:
from sklearn.cross_validation import train_test_split
In [21]:
# Separate the 'poi' label column from the remaining columns, then hold out
# 20% of the rows for testing using a fixed seed.
labels = df['poi']
features = df.drop('poi', axis=1)
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features, labels, test_size=0.2, random_state=808)
In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
In [23]:
# Search grid for the SVC: every combination of C and gamma below.
param_grid = [{
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001],
}]
In [24]:
from sklearn import grid_search
In [25]:
# Grid search over param_grid for an SVC, scored by F1, using 4 parallel jobs.
svm_model = SVC()
clf = grid_search.GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='f1',
    n_jobs=4,
)
In [25]:
In [26]:
# Run the grid search on the training split.
clf.fit(features_train, labels_train)
Out[26]:
In [27]:
# Show the estimator selected by the grid search.
clf.best_estimator_
Out[27]:
In [28]:
# Show the score recorded for the selected estimator.
clf.best_score_
Out[28]:
In [29]:
# Show the scorer object the grid search used.
clf.scorer_
Out[29]:
In [30]:
from sklearn.ensemble import RandomForestClassifier
In [31]:
# 1000-tree random forest trained with 4 parallel jobs. random_state is fixed
# (same seed as the train/test split) so repeated runs give the same forest.
rf = RandomForestClassifier(n_estimators=1000, n_jobs=4, random_state=808)
In [32]:
# Fit the random forest on the training split.
rf.fit(features_train, labels_train)
Out[32]:
In [33]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
In [34]:
pred = rf.predict(features_test)
print "Accuracy:", accuracy_score(labels_test, pred), '\n'
print "Confusion Matrix:\n", confusion_matrix(labels_test, pred), '\n'
print "Classification Report:", classification_report(labels_test, pred)
In [35]:
# Convert features and labels to numpy arrays so they can be indexed by
# integer position.
features, labels = np.array(features), np.array(labels)
In [36]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold( labels, n_folds=3 )
precisions = []
recalls = []
for train_idx, test_idx in skf:
features_train = []
features_test = []
labels_train = []
labels_test = []
for ii in train_idx:
features_train.append( features[ii] )
labels_train.append( labels[ii] )
for jj in test_idx:
features_test.append( features[jj] )
labels_test.append( labels[jj] )
### fit the classifier using training set, and test on test set
rf.fit(features_train, labels_train)
pred = rf.predict(features_test)
### for each fold, print some metrics
print
print "precision score: ", precision_score( labels_test, pred )
print "recall score: ", recall_score( labels_test, pred )
precisions.append( precision_score(labels_test, pred) )
recalls.append( recall_score(labels_test, pred) )
### aggregate precision and recall over all folds
print "average precision: ", sum(precisions)/3.
print "average recall: ", sum(recalls)/3.
In [ ]:
#%load ../ud120-projects/final_project/tester.py
In [37]:
#!/usr/bin/pickle
""" a basic script for importing student's POI identifier,
and checking the results that they get from it
requires that the algorithm, dataset, and features list
be written to my_classifier.pkl, my_dataset.pkl, and
my_feature_list.pkl, respectively
that process should happen at the end of poi_id.py
"""
import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
### load up student's classifier, dataset, and feature_list
clf = pickle.load(open("my_classifier.pkl", "r") )
dataset = pickle.load(open("my_dataset.pkl", "r") )
feature_list = pickle.load(open("my_feature_list.pkl", "r"))
### print basic info about the algorithm/parameters used
print clf
### prepare data for training/testing
data = featureFormat(dataset, feature_list)
labels, features = targetFeatureSplit(data)
### stratified k-fold cross-validation is a form of
### CV where instances of each class are equally apportioned--
### e.g. if you have 10% of one class and 90% of the other,
### stratification means each fold will have 10% of one
### class and 90% of the other
###
### this is helpful when you don't have a lot of instances
### of one class or the other, because in that case the
### low-frequency class can become lopsided in the training-test
### split skew the results
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold( labels, n_folds=3 )
precisions = []
recalls = []
for train_idx, test_idx in skf:
features_train = []
features_test = []
labels_train = []
labels_test = []
for ii in train_idx:
features_train.append( features[ii] )
labels_train.append( labels[ii] )
for jj in test_idx:
features_test.append( features[jj] )
labels_test.append( labels[jj] )
### fit the classifier using training set, and test on test set
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
### for each fold, print some metrics
print
print "precision score: ", precision_score( labels_test, pred )
print "recall score: ", recall_score( labels_test, pred )
precisions.append( precision_score(labels_test, pred) )
recalls.append( recall_score(labels_test, pred) )
### aggregate precision and recall over all folds
print "average precision: ", sum(precisions)/3.
print "average recall: ", sum(recalls)/3.
#print precision_score( labels_test, pred )
#print recall_score( labels_test, pred )
In [38]:
# Preview the first rows of the current frame.
df.head()
Out[38]:
In [39]:
#df[(df.poi == True)].email_address
In [43]:
# Column-level summary of the current frame.
df.info()
In [46]:
# Summary statistics for the current frame.
df.describe()
Out[46]:
In [47]:
import matplotlib.pyplot as plt
In [81]:
# Salary for every row, with missing salaries filled by the median salary.
# The fill value must come from the salary column itself, not from
# to_messages, which is a different quantity.
plt.plot(df.salary.fillna(df.salary.median()))
# Overlay the salaries of the rows flagged as POI.
plt.plot(df[df.poi==True].salary)
Out[81]:
In [78]:
# Fill each column's missing values with that column's own median, then
# summarise the result. df itself is left unchanged.
median_filled = df.apply(lambda col: col.fillna(col.median()), axis=0)
median_filled.describe()
Out[78]:
In [74]:
# Column-level summary of the frame before plotting.
df.info()
In [101]:
# long_term_incentive for every row (red dots), with the rows flagged as POI
# drawn on top (blue dots).
poi_rows = df[df.poi == True]
plt.plot(df.long_term_incentive, 'ro')
plt.plot(poi_rows.long_term_incentive, 'bo')
Out[101]:
In [108]:
# Columns excluded from the reduced frame df1.
columns_to_drop = [
    'deferral_payments',
    'restricted_stock_deferred',
    'loan_advances',
    'director_fees',
]
df1 = df.drop(columns_to_drop, axis=1)
In [117]:
# f1 holds every df1 column except the label; y1 is the 'poi' label column.
f1 = df1.drop('poi', axis=1)
y1 = df['poi']
In [119]:
from sklearn.preprocessing import scale
In [122]:
# Fill each column's missing values with that column's median, then pass the
# filled frame through sklearn's scale().
f1 = f1.apply(lambda col: col.fillna(col.median()), axis=0)
f_scaled = scale(f1)
In [125]:
from sklearn.decomposition import RandomizedPCA
In [128]:
# Fit a 5-component whitened randomized PCA on the scaled features.
# random_state is fixed (same seed as the train/test split) because the
# randomized solver would otherwise give different components on each run.
pca = RandomizedPCA(n_components=5, whiten=True, random_state=808).fit(f_scaled)
In [129]:
# Project the scaled features onto the fitted PCA components.
x_pca = pca.transform(f_scaled)
In [142]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

# Grid of C and gamma values tried for the RBF-kernel SVC.
param_grid = {
    'C': [.01, .1, 1, 10, 100, 1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1, 10, 100, 1000],
}
rbf_svc = SVC(kernel='rbf', class_weight='auto')
clf = GridSearchCV(rbf_svc, param_grid, n_jobs=4)
# fit() hands back the fitted search object, which is kept under the name clf.
clf = clf.fit(x_pca, y1)
In [143]:
# Show the estimator selected by the PCA + SVC grid search.
print clf.best_estimator_
In [145]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(x_pca)
print classification_report(y1, y_pred)
print confusion_matrix(y1, y_pred)
In [ ]: